import datetime
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import sys
import warnings
warnings.filterwarnings("ignore")
# to visualise all the columns in the dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.set(rc= {"figure.figsize": (12, 16)}) #width= 12, #height= 162
# check scikit-learn version
import sklearn
print("\nsklearn version {}".format(sklearn.__version__))
import sys
print("\nPython version {}".format(sys.version))
sklearn version 1.2.2 Python version 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
# builtin function for distribution of target variable
def distribution_of_target(df):
    """Print the class counts and percentages of TARGET and draw a pie chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'TARGET' column with classes 0 (non-defaulter)
        and 1 (defaulter).
    """
    count = df["TARGET"].value_counts()
    print(count)
    # Percentage calculation -- normalize=True replaces the manual division
    print("Percentage: ")
    print(df["TARGET"].value_counts(normalize=True) * 100)
    # Derive labels from the counts' index so the labels always match the
    # slices, instead of assuming value_counts() puts class 0 first.
    labels = ["Non-Defaulter" if cls == 0 else "Defaulter" for cls in count.index]
    plt.figure(figsize=(4, 4), tight_layout=True)
    plt.pie(count, labels=labels, autopct='%1.1f%%', startangle=90,
            colors=plt.cm.Paired.colors)
    plt.title('Pie Plot for Target')
    # BUG FIX: without plt.show() the figure only renders under %matplotlib inline
    plt.show()
# Plot distribution of one feature (continuous)
def plot_distribution(data, feature, color):
    """Plot the distribution of one continuous feature and print summary stats.

    Parameters
    ----------
    data : pandas.DataFrame
    feature : str
        Column of *data* to analyse.
    color : str
        Colour passed through to the plot.
    """
    # Use one NaN-free series everywhere: previously np.mean/min/max ran on
    # the raw column (NaN-poisoned) while the median used dropna().
    values = data[feature].dropna()
    plt.figure(figsize=(12, 4))
    plt.title(f"Distribution of {feature}s")
    # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # histplot(..., kde=True) is the supported replacement.
    sns.histplot(values, color=color, kde=True, bins=100)
    # Calculate basic statistics
    mean_value = np.mean(values)
    median_value = np.median(values)
    skewness = values.skew()
    kurtosis = values.kurt()
    # Print summary
    print("Feature:", feature, "\n")
    print(f"range{values.min(), values.max()}")
    print(f"Mean: {mean_value}")
    print(f"Median: {median_value}")
    print(f"Skewness: {skewness}")
    print(f"Kurtosis: {kurtosis}")
    print("")
    if -0.5 < skewness < 0.5:
        print("The distribution is approximately symmetric.")
    elif skewness <= -0.5:
        print("The distribution is negatively skewed (skewed to the left).")
    else:
        print("The distribution is positively skewed (skewed to the right).")
    # BUG FIX: pandas .kurt() returns *excess* kurtosis (Fisher definition,
    # normal distribution == 0), so the comparison point is 0, not 3.
    if kurtosis == 0:
        print("The kurtosis is similar to that of a normal distribution.")
    elif kurtosis < 0:
        print("The distribution has lighter tails (less extreme values).")
    else:
        print("The distribution has heavier tails (more extreme values).")
    plt.tight_layout()
    plt.show()
# normal distribution check
def normal_distribution_check(data):
    """Run a Shapiro-Wilk normality test on *data* and print the verdict.

    NaNs are dropped first: shapiro() otherwise returns a NaN p-value,
    and `nan > 0.05` is False, so NaN input silently printed the
    "reject" branch regardless of the data.

    Parameters
    ----------
    data : array-like of float

    Returns
    -------
    (statistic, p_value) : tuple of float
        Added for convenience; existing callers ignore the return value.
    """
    print("\nNormal Distribution Check using Shapiro test: \n")
    arr = np.asarray(data, dtype=float)
    arr = arr[~np.isnan(arr)]  # BUG FIX: NaNs poison the test result
    # NOTE(review): shapiro's p-value is known to be unreliable for N > 5000;
    # consider testing a random sample for very large series.
    statistic, p_value = shapiro(arr)
    print(f"Test Statistic: {statistic}, P-value: {p_value}")
    if p_value > 0.05:
        print("Fail to reject the null hypothesis (data may be normally distributed)")
    else:
        print("Reject the null hypothesis (data may not be normally distributed)\n")
    return statistic, p_value
# Function to plot KDE and Box plots side by side
def plot_kde_and_box(data, feature):
    """Plot a KDE of *feature* split by TARGET next to a TARGET-split box plot,
    followed by a whole-population box plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain *feature* and a binary 'TARGET' column.
    feature : str
    """
    sns.set(style="whitegrid")
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
    # KDE plot -- `shade=` was deprecated in seaborn 0.11 and REMOVED in
    # 0.14 (TypeError); `fill=` is the supported equivalent.
    sns.kdeplot(data.loc[data['TARGET'] == 0, feature], label='Target == 0',
                fill=True, ax=axes[0])
    sns.kdeplot(data.loc[data['TARGET'] == 1, feature], label='Target == 1',
                fill=True, ax=axes[0])
    axes[0].set_title(f'Distribution of {feature} by TARGET')
    axes[0].set_xlabel(feature)
    axes[0].set_ylabel('Density')
    axes[0].legend()
    # Box plot split by TARGET class
    sns.boxplot(x="TARGET", y=feature, data=data, palette="Set3", ax=axes[1])
    axes[1].set_title(f"{feature} Box Plot")
    axes[1].set_xlabel("TARGET")
    axes[1].set_ylabel(feature)
    plt.tight_layout()
    plt.show()
    # Additional box plot over the whole population (not split by TARGET)
    data.boxplot(column=[feature])
    plt.show()
### Visualizing a Pandas Correlation Matrix Using Seaborn
def corr_matrix(df):
    """Show a heatmap of df's pairwise correlations, rounded to 2 decimals."""
    rounded = df.corr().round(2)
    sns.heatmap(rounded, annot=True)
    plt.show()
### Identify outliers
def identify_outliers(df, var):
    """Show a box plot of *var* and return its Tukey/IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
    var : str
        Column to inspect.

    Returns
    -------
    (lower, upper) : tuple of float
        Fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    """
    q1, q3 = df[var].quantile([0.25, 0.75])
    iqr = q3 - q1
    fences = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df.boxplot(column=[var])
    plt.show()
    return fences
### WE will cap by using winsorization technique using IQR method.
def cap_outliers(df, VAR):
    """Winsorize df[VAR] in place at its Tukey/IQR fences and return the column.

    Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are clipped to the
    respective fence. Note: mutates the caller's DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
    VAR : str

    Returns
    -------
    pandas.Series
        The capped column (also written back into *df*).
    """
    q1, q3 = df[VAR].quantile([0.25, 0.75])
    spread = q3 - q1
    lo, hi = q1 - 1.5 * spread, q3 + 1.5 * spread
    df[VAR] = df[VAR].clip(lower=lo, upper=hi)
    return df[VAR]
# Load the cleaned dataset and inspect its shape, columns and dtypes.
df = pd.read_csv("./Data/Cleaned_housing_loan_data_240126.csv", encoding='Latin-1')
# Convert the day-count columns to (absolute) whole years.
for day_col in ["DAYS_BIRTH", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH",
                "PA_DAYS_DECISION_max_y", "IP_DAYS_INSTALMENT_min"]:
    df[day_col] = abs(round(df[day_col] / 365))
print("These are the dimensions of our data:\n")
print(df.shape, "\n")
print("Columns in our dataset:\n")
print(df.columns.values, "\n")
print("Count of data types:\n")
print(df.dtypes.value_counts(), "\n")
print("No of object features: ", len(df.select_dtypes('object').columns))
These are the dimensions of our data: (307511, 16) Columns in our dataset: ['AMT_ANNUITY' 'DAYS_BIRTH' 'DAYS_REGISTRATION' 'DAYS_ID_PUBLISH' 'EXT_SOURCE_2' 'EXT_SOURCE_3' 'b_DAYS_CREDIT_max' 'PA_AMT_CREDIT_sum' 'PA_DAYS_DECISION_max_y' 'IP_DAYS_INSTALMENT_min' 'IP_AMT_INSTALMENT_sum' 'IP_AMT_INSTALMENT_avg' 'AT_ANNUITY_INCOME_PERCENT' 'AT_CREDIT_TERM' 'AT_DAYS_EMPLOYED_PERCENT' 'TARGET'] Count of data types: float64 15 int64 1 Name: count, dtype: int64 No of object features: 0
# Explain the TARGET encoding and show the raw class counts.
print("TARGET value 0 means loan is repaid, value 1 means loan is not repaid.\n")
print(f"Count of classes in the Target variable {df['TARGET'].value_counts()}")
TARGET value 0 means loan is repaid, value 1 means loan is not repaid. Count of classes in the Target variable TARGET 0 282686 1 24825 Name: count, dtype: int64
# Show counts, percentages and a pie chart of the TARGET classes.
distribution_of_target(df)
TARGET 0 282686 1 24825 Name: count, dtype: int64 Percentage: TARGET 0 91.927118 1 8.072882 Name: count, dtype: float64
We have 307511 rows and 16 columns, all of which are numeric.
Our target variable has two classes, 0 (Non-defaulter) and 1 (Defaulter), with counts and percentages as follows:
# Work on a copy and collect the numeric feature names (excluding the target).
data = df.copy()
num_df = data.select_dtypes(include=['number'])
num_vars_ = num_df.columns.tolist()
# BUG FIX: drop the target by NAME rather than assuming it sits at
# positional index 15 -- robust if the column order ever changes.
num_vars = [col for col in num_vars_ if col != "TARGET"]
print(num_vars)
print(len(num_vars))
['AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'b_DAYS_CREDIT_max', 'PA_AMT_CREDIT_sum', 'PA_DAYS_DECISION_max_y', 'IP_DAYS_INSTALMENT_min', 'IP_AMT_INSTALMENT_sum', 'IP_AMT_INSTALMENT_avg', 'AT_ANNUITY_INCOME_PERCENT', 'AT_CREDIT_TERM', 'AT_DAYS_EMPLOYED_PERCENT'] 15
# KDE + box plots for every numeric feature, numbered for readability.
for feature_num, each in enumerate(num_vars, start=1):
    print("\n%d: Feature %s " % (feature_num, each))
    plot_kde_and_box(data, each)
1: Feature AMT_ANNUITY
2: Feature DAYS_BIRTH
3: Feature DAYS_REGISTRATION
4: Feature DAYS_ID_PUBLISH
5: Feature EXT_SOURCE_2
6: Feature EXT_SOURCE_3
7: Feature b_DAYS_CREDIT_max
8: Feature PA_AMT_CREDIT_sum
9: Feature PA_DAYS_DECISION_max_y
10: Feature IP_DAYS_INSTALMENT_min
11: Feature IP_AMT_INSTALMENT_sum
12: Feature IP_AMT_INSTALMENT_avg
13: Feature AT_ANNUITY_INCOME_PERCENT
14: Feature AT_CREDIT_TERM
15: Feature AT_DAYS_EMPLOYED_PERCENT
1) The AMT_ANNUITY for Defaulter is almost same as that of Non-Defaulters. There are many outliers for both Defaulters and Non-Defaulters.
2) The DAYS_BIRTH now converted to years since birth indicating the Age of Defaulters is usually lesser than the Non-Defaulters. All the quantiles of ages of Defaulters is lesser than Non-Defaulters. The 75th percentile value of Non-Defaulters is around 54 years while for Defaulters it is near to 49 years. There are no outliers for both Defaulters and Non-Defaulters. These observations imply that the Defaulters are usually younger than Non-Defaulters.
3) The DAYS_REGISTRATION for Defaulter is almost same as that of Non-Defaulters. There are many outliers for both Defaulters and Non-Defaulters.
4) The DAYS_ID_PUBLISH indicates the number of days before the application did client change the identity document with which he applied for the loan. DAYS_ID_PUBLISH for Defaulter is slightly less than Non-Defaulters. There are no outliers for both Defaulters and Non-Defaulters.
**We infer that defaulters change their identity documents more recently than non-defaulters.**
5) The EXT_SOURCE_2 for defaulter is less than the Non-Defaulters. It is interesting to note that the median value for defaulters is almost equal to or lower than 25th percentile values of Non-Defaulters. There are no outliers for both Defaulters and Non-Defaulters.
6) The EXT_SOURCE_3 for defaulter is less than the Non-Defaulters. It is interesting to note that the median value for defaulters is almost equal to or lower than 25th percentile values of Non-Defaulters. There are outliers for Non-Defaulters.
7) The b_DAYS_CREDIT_max for defaulter is more than the Non-Defaulters. There are no outliers for both Defaulters and Non-Defaulters.
8) The PA_AMT_CREDIT_sum for defaulter is almost same as that of Non-Defaulters. There are no outliers for both Defaulters and Non-Defaulters.
9) The PA_DAYS_DECISION_max_y for defaulters is visibly greater than that of Non-Defaulters. It is interesting to note that the whisker end for Defaulters is almost the same as Q3 of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
10) The IP_DAYS_INSTALMENT_min for defaulter is less than the Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
11) The IP_AMT_INSTALMENT_sum for defaulter is almost same as that of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
12) The IP_AMT_INSTALMENT_avg for defaulter is almost same as that of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
13) The AT_ANNUITY_INCOME_PERCENT for defaulter is almost same as that of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
14) The AT_CREDIT_TERM for defaulter is almost same as that of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
15) The AT_DAYS_EMPLOYED_PERCENT for defaulter is almost same as that of Non-Defaulters. There are outliers for both Defaulters and Non-Defaulters.
We shall check for the correlation among numeric independent variables
A correlation matrix is a common tool used to compare the coefficients of correlation between different features (or attributes) in a dataset. It allows us to visualize how much (or how little) correlation exists between different variables.
# Heatmap of all pairwise correlations (still including TARGET).
corr_matrix(data)
# Build a TARGET-free correlation matrix: drop the TARGET row, then column.
corr_matrix_ = data.corr().round(2)
corr_matrix__ = corr_matrix_.drop(['TARGET'], axis='index')
# NOTE(review): this rebinding SHADOWS the corr_matrix() function defined
# above -- the function cannot be called again after this line.
corr_matrix = corr_matrix__.drop(['TARGET'], axis = 1)
print(corr_matrix)
AMT_ANNUITY DAYS_BIRTH DAYS_REGISTRATION \
AMT_ANNUITY 1.00 -0.01 -0.04
DAYS_BIRTH -0.01 1.00 0.33
DAYS_REGISTRATION -0.04 0.33 1.00
DAYS_ID_PUBLISH -0.01 0.27 0.10
EXT_SOURCE_2 0.13 0.09 0.06
EXT_SOURCE_3 0.03 0.18 0.10
b_DAYS_CREDIT_max -0.04 -0.09 -0.05
PA_AMT_CREDIT_sum 0.10 0.12 0.02
PA_DAYS_DECISION_max_y 0.02 0.03 0.04
IP_DAYS_INSTALMENT_min 0.02 0.13 0.07
IP_AMT_INSTALMENT_sum 0.14 0.15 0.03
IP_AMT_INSTALMENT_avg 0.16 0.06 -0.01
AT_ANNUITY_INCOME_PERCENT 0.48 0.08 0.03
AT_CREDIT_TERM -0.06 -0.09 -0.03
AT_DAYS_EMPLOYED_PERCENT 0.10 -0.59 -0.20
DAYS_ID_PUBLISH EXT_SOURCE_2 EXT_SOURCE_3 \
AMT_ANNUITY -0.01 0.13 0.03
DAYS_BIRTH 0.27 0.09 0.18
DAYS_REGISTRATION 0.10 0.06 0.10
DAYS_ID_PUBLISH 1.00 0.05 0.11
EXT_SOURCE_2 0.05 1.00 0.09
EXT_SOURCE_3 0.11 0.09 1.00
b_DAYS_CREDIT_max -0.08 -0.05 -0.31
PA_AMT_CREDIT_sum 0.03 0.04 -0.08
PA_DAYS_DECISION_max_y 0.03 0.02 0.09
IP_DAYS_INSTALMENT_min 0.12 0.05 0.09
IP_AMT_INSTALMENT_sum 0.04 0.06 -0.01
IP_AMT_INSTALMENT_avg 0.01 0.06 -0.00
AT_ANNUITY_INCOME_PERCENT 0.02 -0.04 0.09
AT_CREDIT_TERM -0.02 -0.05 0.00
AT_DAYS_EMPLOYED_PERCENT -0.25 0.02 -0.10
b_DAYS_CREDIT_max PA_AMT_CREDIT_sum \
AMT_ANNUITY -0.04 0.10
DAYS_BIRTH -0.09 0.12
DAYS_REGISTRATION -0.05 0.02
DAYS_ID_PUBLISH -0.08 0.03
EXT_SOURCE_2 -0.05 0.04
EXT_SOURCE_3 -0.31 -0.08
b_DAYS_CREDIT_max 1.00 0.02
PA_AMT_CREDIT_sum 0.02 1.00
PA_DAYS_DECISION_max_y -0.03 -0.18
IP_DAYS_INSTALMENT_min -0.05 0.23
IP_AMT_INSTALMENT_sum -0.02 0.65
IP_AMT_INSTALMENT_avg -0.00 0.31
AT_ANNUITY_INCOME_PERCENT -0.05 -0.07
AT_CREDIT_TERM 0.00 -0.04
AT_DAYS_EMPLOYED_PERCENT 0.02 -0.02
PA_DAYS_DECISION_max_y IP_DAYS_INSTALMENT_min \
AMT_ANNUITY 0.02 0.02
DAYS_BIRTH 0.03 0.13
DAYS_REGISTRATION 0.04 0.07
DAYS_ID_PUBLISH 0.03 0.12
EXT_SOURCE_2 0.02 0.05
EXT_SOURCE_3 0.09 0.09
b_DAYS_CREDIT_max -0.03 -0.05
PA_AMT_CREDIT_sum -0.18 0.23
PA_DAYS_DECISION_max_y 1.00 0.26
IP_DAYS_INSTALMENT_min 0.26 1.00
IP_AMT_INSTALMENT_sum -0.12 0.35
IP_AMT_INSTALMENT_avg -0.07 -0.02
AT_ANNUITY_INCOME_PERCENT 0.08 0.03
AT_CREDIT_TERM -0.01 -0.03
AT_DAYS_EMPLOYED_PERCENT -0.00 -0.02
IP_AMT_INSTALMENT_sum IP_AMT_INSTALMENT_avg \
AMT_ANNUITY 0.14 0.16
DAYS_BIRTH 0.15 0.06
DAYS_REGISTRATION 0.03 -0.01
DAYS_ID_PUBLISH 0.04 0.01
EXT_SOURCE_2 0.06 0.06
EXT_SOURCE_3 -0.01 -0.00
b_DAYS_CREDIT_max -0.02 -0.00
PA_AMT_CREDIT_sum 0.65 0.31
PA_DAYS_DECISION_max_y -0.12 -0.07
IP_DAYS_INSTALMENT_min 0.35 -0.02
IP_AMT_INSTALMENT_sum 1.00 0.41
IP_AMT_INSTALMENT_avg 0.41 1.00
AT_ANNUITY_INCOME_PERCENT -0.05 -0.01
AT_CREDIT_TERM -0.05 -0.02
AT_DAYS_EMPLOYED_PERCENT -0.03 0.00
AT_ANNUITY_INCOME_PERCENT AT_CREDIT_TERM \
AMT_ANNUITY 0.48 -0.06
DAYS_BIRTH 0.08 -0.09
DAYS_REGISTRATION 0.03 -0.03
DAYS_ID_PUBLISH 0.02 -0.02
EXT_SOURCE_2 -0.04 -0.05
EXT_SOURCE_3 0.09 0.00
b_DAYS_CREDIT_max -0.05 0.00
PA_AMT_CREDIT_sum -0.07 -0.04
PA_DAYS_DECISION_max_y 0.08 -0.01
IP_DAYS_INSTALMENT_min 0.03 -0.03
IP_AMT_INSTALMENT_sum -0.05 -0.05
IP_AMT_INSTALMENT_avg -0.01 -0.02
AT_ANNUITY_INCOME_PERCENT 1.00 -0.03
AT_CREDIT_TERM -0.03 1.00
AT_DAYS_EMPLOYED_PERCENT -0.08 0.02
AT_DAYS_EMPLOYED_PERCENT
AMT_ANNUITY 0.10
DAYS_BIRTH -0.59
DAYS_REGISTRATION -0.20
DAYS_ID_PUBLISH -0.25
EXT_SOURCE_2 0.02
EXT_SOURCE_3 -0.10
b_DAYS_CREDIT_max 0.02
PA_AMT_CREDIT_sum -0.02
PA_DAYS_DECISION_max_y -0.00
IP_DAYS_INSTALMENT_min -0.02
IP_AMT_INSTALMENT_sum -0.03
IP_AMT_INSTALMENT_avg 0.00
AT_ANNUITY_INCOME_PERCENT -0.08
AT_CREDIT_TERM 0.02
AT_DAYS_EMPLOYED_PERCENT 1.00
corr_matrix
| AMT_ANNUITY | DAYS_BIRTH | DAYS_REGISTRATION | DAYS_ID_PUBLISH | EXT_SOURCE_2 | EXT_SOURCE_3 | b_DAYS_CREDIT_max | PA_AMT_CREDIT_sum | PA_DAYS_DECISION_max_y | IP_DAYS_INSTALMENT_min | IP_AMT_INSTALMENT_sum | IP_AMT_INSTALMENT_avg | AT_ANNUITY_INCOME_PERCENT | AT_CREDIT_TERM | AT_DAYS_EMPLOYED_PERCENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AMT_ANNUITY | 1.00 | -0.01 | -0.04 | -0.01 | 0.13 | 0.03 | -0.04 | 0.10 | 0.02 | 0.02 | 0.14 | 0.16 | 0.48 | -0.06 | 0.10 |
| DAYS_BIRTH | -0.01 | 1.00 | 0.33 | 0.27 | 0.09 | 0.18 | -0.09 | 0.12 | 0.03 | 0.13 | 0.15 | 0.06 | 0.08 | -0.09 | -0.59 |
| DAYS_REGISTRATION | -0.04 | 0.33 | 1.00 | 0.10 | 0.06 | 0.10 | -0.05 | 0.02 | 0.04 | 0.07 | 0.03 | -0.01 | 0.03 | -0.03 | -0.20 |
| DAYS_ID_PUBLISH | -0.01 | 0.27 | 0.10 | 1.00 | 0.05 | 0.11 | -0.08 | 0.03 | 0.03 | 0.12 | 0.04 | 0.01 | 0.02 | -0.02 | -0.25 |
| EXT_SOURCE_2 | 0.13 | 0.09 | 0.06 | 0.05 | 1.00 | 0.09 | -0.05 | 0.04 | 0.02 | 0.05 | 0.06 | 0.06 | -0.04 | -0.05 | 0.02 |
| EXT_SOURCE_3 | 0.03 | 0.18 | 0.10 | 0.11 | 0.09 | 1.00 | -0.31 | -0.08 | 0.09 | 0.09 | -0.01 | -0.00 | 0.09 | 0.00 | -0.10 |
| b_DAYS_CREDIT_max | -0.04 | -0.09 | -0.05 | -0.08 | -0.05 | -0.31 | 1.00 | 0.02 | -0.03 | -0.05 | -0.02 | -0.00 | -0.05 | 0.00 | 0.02 |
| PA_AMT_CREDIT_sum | 0.10 | 0.12 | 0.02 | 0.03 | 0.04 | -0.08 | 0.02 | 1.00 | -0.18 | 0.23 | 0.65 | 0.31 | -0.07 | -0.04 | -0.02 |
| PA_DAYS_DECISION_max_y | 0.02 | 0.03 | 0.04 | 0.03 | 0.02 | 0.09 | -0.03 | -0.18 | 1.00 | 0.26 | -0.12 | -0.07 | 0.08 | -0.01 | -0.00 |
| IP_DAYS_INSTALMENT_min | 0.02 | 0.13 | 0.07 | 0.12 | 0.05 | 0.09 | -0.05 | 0.23 | 0.26 | 1.00 | 0.35 | -0.02 | 0.03 | -0.03 | -0.02 |
| IP_AMT_INSTALMENT_sum | 0.14 | 0.15 | 0.03 | 0.04 | 0.06 | -0.01 | -0.02 | 0.65 | -0.12 | 0.35 | 1.00 | 0.41 | -0.05 | -0.05 | -0.03 |
| IP_AMT_INSTALMENT_avg | 0.16 | 0.06 | -0.01 | 0.01 | 0.06 | -0.00 | -0.00 | 0.31 | -0.07 | -0.02 | 0.41 | 1.00 | -0.01 | -0.02 | 0.00 |
| AT_ANNUITY_INCOME_PERCENT | 0.48 | 0.08 | 0.03 | 0.02 | -0.04 | 0.09 | -0.05 | -0.07 | 0.08 | 0.03 | -0.05 | -0.01 | 1.00 | -0.03 | -0.08 |
| AT_CREDIT_TERM | -0.06 | -0.09 | -0.03 | -0.02 | -0.05 | 0.00 | 0.00 | -0.04 | -0.01 | -0.03 | -0.05 | -0.02 | -0.03 | 1.00 | 0.02 |
| AT_DAYS_EMPLOYED_PERCENT | 0.10 | -0.59 | -0.20 | -0.25 | 0.02 | -0.10 | 0.02 | -0.02 | -0.00 | -0.02 | -0.03 | 0.00 | -0.08 | 0.02 | 1.00 |
# Highlight strongly correlated pairs: values <= 0.7 become NaN, so any
# off-diagonal number left visible flags potential multicollinearity.
corr_matrix[corr_matrix > .7]
# Ref: https://blog.clairvoyantsoft.com/correlation-and-collinearity-how-they-can-make-or-break-a-model-9135fbe6936a
| AMT_ANNUITY | DAYS_BIRTH | DAYS_REGISTRATION | DAYS_ID_PUBLISH | EXT_SOURCE_2 | EXT_SOURCE_3 | b_DAYS_CREDIT_max | PA_AMT_CREDIT_sum | PA_DAYS_DECISION_max_y | IP_DAYS_INSTALMENT_min | IP_AMT_INSTALMENT_sum | IP_AMT_INSTALMENT_avg | AT_ANNUITY_INCOME_PERCENT | AT_CREDIT_TERM | AT_DAYS_EMPLOYED_PERCENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AMT_ANNUITY | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DAYS_BIRTH | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DAYS_REGISTRATION | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DAYS_ID_PUBLISH | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EXT_SOURCE_2 | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EXT_SOURCE_3 | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| b_DAYS_CREDIT_max | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PA_AMT_CREDIT_sum | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PA_DAYS_DECISION_max_y | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| IP_DAYS_INSTALMENT_min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN |
| IP_AMT_INSTALMENT_sum | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN |
| IP_AMT_INSTALMENT_avg | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN |
| AT_ANNUITY_INCOME_PERCENT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN |
| AT_CREDIT_TERM | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN |
| AT_DAYS_EMPLOYED_PERCENT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
# Column names of the TARGET-free correlation matrix; reused by the loops below.
colsnm = corr_matrix.columns
colsnm
Index(['AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
'EXT_SOURCE_2', 'EXT_SOURCE_3', 'b_DAYS_CREDIT_max',
'PA_AMT_CREDIT_sum', 'PA_DAYS_DECISION_max_y', 'IP_DAYS_INSTALMENT_min',
'IP_AMT_INSTALMENT_sum', 'IP_AMT_INSTALMENT_avg',
'AT_ANNUITY_INCOME_PERCENT', 'AT_CREDIT_TERM',
'AT_DAYS_EMPLOYED_PERCENT'],
dtype='object')
# Print every feature's full correlation column, one at a time.
for col in colsnm:
    print("\n {}".format(col))
    print(corr_matrix.loc[:, col])
AMT_ANNUITY AMT_ANNUITY 1.00 DAYS_BIRTH -0.01 DAYS_REGISTRATION -0.04 DAYS_ID_PUBLISH -0.01 EXT_SOURCE_2 0.13 EXT_SOURCE_3 0.03 b_DAYS_CREDIT_max -0.04 PA_AMT_CREDIT_sum 0.10 PA_DAYS_DECISION_max_y 0.02 IP_DAYS_INSTALMENT_min 0.02 IP_AMT_INSTALMENT_sum 0.14 IP_AMT_INSTALMENT_avg 0.16 AT_ANNUITY_INCOME_PERCENT 0.48 AT_CREDIT_TERM -0.06 AT_DAYS_EMPLOYED_PERCENT 0.10 Name: AMT_ANNUITY, dtype: float64 DAYS_BIRTH AMT_ANNUITY -0.01 DAYS_BIRTH 1.00 DAYS_REGISTRATION 0.33 DAYS_ID_PUBLISH 0.27 EXT_SOURCE_2 0.09 EXT_SOURCE_3 0.18 b_DAYS_CREDIT_max -0.09 PA_AMT_CREDIT_sum 0.12 PA_DAYS_DECISION_max_y 0.03 IP_DAYS_INSTALMENT_min 0.13 IP_AMT_INSTALMENT_sum 0.15 IP_AMT_INSTALMENT_avg 0.06 AT_ANNUITY_INCOME_PERCENT 0.08 AT_CREDIT_TERM -0.09 AT_DAYS_EMPLOYED_PERCENT -0.59 Name: DAYS_BIRTH, dtype: float64 DAYS_REGISTRATION AMT_ANNUITY -0.04 DAYS_BIRTH 0.33 DAYS_REGISTRATION 1.00 DAYS_ID_PUBLISH 0.10 EXT_SOURCE_2 0.06 EXT_SOURCE_3 0.10 b_DAYS_CREDIT_max -0.05 PA_AMT_CREDIT_sum 0.02 PA_DAYS_DECISION_max_y 0.04 IP_DAYS_INSTALMENT_min 0.07 IP_AMT_INSTALMENT_sum 0.03 IP_AMT_INSTALMENT_avg -0.01 AT_ANNUITY_INCOME_PERCENT 0.03 AT_CREDIT_TERM -0.03 AT_DAYS_EMPLOYED_PERCENT -0.20 Name: DAYS_REGISTRATION, dtype: float64 DAYS_ID_PUBLISH AMT_ANNUITY -0.01 DAYS_BIRTH 0.27 DAYS_REGISTRATION 0.10 DAYS_ID_PUBLISH 1.00 EXT_SOURCE_2 0.05 EXT_SOURCE_3 0.11 b_DAYS_CREDIT_max -0.08 PA_AMT_CREDIT_sum 0.03 PA_DAYS_DECISION_max_y 0.03 IP_DAYS_INSTALMENT_min 0.12 IP_AMT_INSTALMENT_sum 0.04 IP_AMT_INSTALMENT_avg 0.01 AT_ANNUITY_INCOME_PERCENT 0.02 AT_CREDIT_TERM -0.02 AT_DAYS_EMPLOYED_PERCENT -0.25 Name: DAYS_ID_PUBLISH, dtype: float64 EXT_SOURCE_2 AMT_ANNUITY 0.13 DAYS_BIRTH 0.09 DAYS_REGISTRATION 0.06 DAYS_ID_PUBLISH 0.05 EXT_SOURCE_2 1.00 EXT_SOURCE_3 0.09 b_DAYS_CREDIT_max -0.05 PA_AMT_CREDIT_sum 0.04 PA_DAYS_DECISION_max_y 0.02 IP_DAYS_INSTALMENT_min 0.05 IP_AMT_INSTALMENT_sum 0.06 IP_AMT_INSTALMENT_avg 0.06 AT_ANNUITY_INCOME_PERCENT -0.04 AT_CREDIT_TERM -0.05 AT_DAYS_EMPLOYED_PERCENT 0.02 Name: 
EXT_SOURCE_2, dtype: float64 EXT_SOURCE_3 AMT_ANNUITY 0.03 DAYS_BIRTH 0.18 DAYS_REGISTRATION 0.10 DAYS_ID_PUBLISH 0.11 EXT_SOURCE_2 0.09 EXT_SOURCE_3 1.00 b_DAYS_CREDIT_max -0.31 PA_AMT_CREDIT_sum -0.08 PA_DAYS_DECISION_max_y 0.09 IP_DAYS_INSTALMENT_min 0.09 IP_AMT_INSTALMENT_sum -0.01 IP_AMT_INSTALMENT_avg -0.00 AT_ANNUITY_INCOME_PERCENT 0.09 AT_CREDIT_TERM 0.00 AT_DAYS_EMPLOYED_PERCENT -0.10 Name: EXT_SOURCE_3, dtype: float64 b_DAYS_CREDIT_max AMT_ANNUITY -0.04 DAYS_BIRTH -0.09 DAYS_REGISTRATION -0.05 DAYS_ID_PUBLISH -0.08 EXT_SOURCE_2 -0.05 EXT_SOURCE_3 -0.31 b_DAYS_CREDIT_max 1.00 PA_AMT_CREDIT_sum 0.02 PA_DAYS_DECISION_max_y -0.03 IP_DAYS_INSTALMENT_min -0.05 IP_AMT_INSTALMENT_sum -0.02 IP_AMT_INSTALMENT_avg -0.00 AT_ANNUITY_INCOME_PERCENT -0.05 AT_CREDIT_TERM 0.00 AT_DAYS_EMPLOYED_PERCENT 0.02 Name: b_DAYS_CREDIT_max, dtype: float64 PA_AMT_CREDIT_sum AMT_ANNUITY 0.10 DAYS_BIRTH 0.12 DAYS_REGISTRATION 0.02 DAYS_ID_PUBLISH 0.03 EXT_SOURCE_2 0.04 EXT_SOURCE_3 -0.08 b_DAYS_CREDIT_max 0.02 PA_AMT_CREDIT_sum 1.00 PA_DAYS_DECISION_max_y -0.18 IP_DAYS_INSTALMENT_min 0.23 IP_AMT_INSTALMENT_sum 0.65 IP_AMT_INSTALMENT_avg 0.31 AT_ANNUITY_INCOME_PERCENT -0.07 AT_CREDIT_TERM -0.04 AT_DAYS_EMPLOYED_PERCENT -0.02 Name: PA_AMT_CREDIT_sum, dtype: float64 PA_DAYS_DECISION_max_y AMT_ANNUITY 0.02 DAYS_BIRTH 0.03 DAYS_REGISTRATION 0.04 DAYS_ID_PUBLISH 0.03 EXT_SOURCE_2 0.02 EXT_SOURCE_3 0.09 b_DAYS_CREDIT_max -0.03 PA_AMT_CREDIT_sum -0.18 PA_DAYS_DECISION_max_y 1.00 IP_DAYS_INSTALMENT_min 0.26 IP_AMT_INSTALMENT_sum -0.12 IP_AMT_INSTALMENT_avg -0.07 AT_ANNUITY_INCOME_PERCENT 0.08 AT_CREDIT_TERM -0.01 AT_DAYS_EMPLOYED_PERCENT -0.00 Name: PA_DAYS_DECISION_max_y, dtype: float64 IP_DAYS_INSTALMENT_min AMT_ANNUITY 0.02 DAYS_BIRTH 0.13 DAYS_REGISTRATION 0.07 DAYS_ID_PUBLISH 0.12 EXT_SOURCE_2 0.05 EXT_SOURCE_3 0.09 b_DAYS_CREDIT_max -0.05 PA_AMT_CREDIT_sum 0.23 PA_DAYS_DECISION_max_y 0.26 IP_DAYS_INSTALMENT_min 1.00 IP_AMT_INSTALMENT_sum 0.35 IP_AMT_INSTALMENT_avg -0.02 
AT_ANNUITY_INCOME_PERCENT 0.03 AT_CREDIT_TERM -0.03 AT_DAYS_EMPLOYED_PERCENT -0.02 Name: IP_DAYS_INSTALMENT_min, dtype: float64 IP_AMT_INSTALMENT_sum AMT_ANNUITY 0.14 DAYS_BIRTH 0.15 DAYS_REGISTRATION 0.03 DAYS_ID_PUBLISH 0.04 EXT_SOURCE_2 0.06 EXT_SOURCE_3 -0.01 b_DAYS_CREDIT_max -0.02 PA_AMT_CREDIT_sum 0.65 PA_DAYS_DECISION_max_y -0.12 IP_DAYS_INSTALMENT_min 0.35 IP_AMT_INSTALMENT_sum 1.00 IP_AMT_INSTALMENT_avg 0.41 AT_ANNUITY_INCOME_PERCENT -0.05 AT_CREDIT_TERM -0.05 AT_DAYS_EMPLOYED_PERCENT -0.03 Name: IP_AMT_INSTALMENT_sum, dtype: float64 IP_AMT_INSTALMENT_avg AMT_ANNUITY 0.16 DAYS_BIRTH 0.06 DAYS_REGISTRATION -0.01 DAYS_ID_PUBLISH 0.01 EXT_SOURCE_2 0.06 EXT_SOURCE_3 -0.00 b_DAYS_CREDIT_max -0.00 PA_AMT_CREDIT_sum 0.31 PA_DAYS_DECISION_max_y -0.07 IP_DAYS_INSTALMENT_min -0.02 IP_AMT_INSTALMENT_sum 0.41 IP_AMT_INSTALMENT_avg 1.00 AT_ANNUITY_INCOME_PERCENT -0.01 AT_CREDIT_TERM -0.02 AT_DAYS_EMPLOYED_PERCENT 0.00 Name: IP_AMT_INSTALMENT_avg, dtype: float64 AT_ANNUITY_INCOME_PERCENT AMT_ANNUITY 0.48 DAYS_BIRTH 0.08 DAYS_REGISTRATION 0.03 DAYS_ID_PUBLISH 0.02 EXT_SOURCE_2 -0.04 EXT_SOURCE_3 0.09 b_DAYS_CREDIT_max -0.05 PA_AMT_CREDIT_sum -0.07 PA_DAYS_DECISION_max_y 0.08 IP_DAYS_INSTALMENT_min 0.03 IP_AMT_INSTALMENT_sum -0.05 IP_AMT_INSTALMENT_avg -0.01 AT_ANNUITY_INCOME_PERCENT 1.00 AT_CREDIT_TERM -0.03 AT_DAYS_EMPLOYED_PERCENT -0.08 Name: AT_ANNUITY_INCOME_PERCENT, dtype: float64 AT_CREDIT_TERM AMT_ANNUITY -0.06 DAYS_BIRTH -0.09 DAYS_REGISTRATION -0.03 DAYS_ID_PUBLISH -0.02 EXT_SOURCE_2 -0.05 EXT_SOURCE_3 0.00 b_DAYS_CREDIT_max 0.00 PA_AMT_CREDIT_sum -0.04 PA_DAYS_DECISION_max_y -0.01 IP_DAYS_INSTALMENT_min -0.03 IP_AMT_INSTALMENT_sum -0.05 IP_AMT_INSTALMENT_avg -0.02 AT_ANNUITY_INCOME_PERCENT -0.03 AT_CREDIT_TERM 1.00 AT_DAYS_EMPLOYED_PERCENT 0.02 Name: AT_CREDIT_TERM, dtype: float64 AT_DAYS_EMPLOYED_PERCENT AMT_ANNUITY 0.10 DAYS_BIRTH -0.59 DAYS_REGISTRATION -0.20 DAYS_ID_PUBLISH -0.25 EXT_SOURCE_2 0.02 EXT_SOURCE_3 -0.10 b_DAYS_CREDIT_max 0.02 PA_AMT_CREDIT_sum 
-0.02 PA_DAYS_DECISION_max_y -0.00 IP_DAYS_INSTALMENT_min -0.02 IP_AMT_INSTALMENT_sum -0.03 IP_AMT_INSTALMENT_avg 0.00 AT_ANNUITY_INCOME_PERCENT -0.08 AT_CREDIT_TERM 0.02 AT_DAYS_EMPLOYED_PERCENT 1.00 Name: AT_DAYS_EMPLOYED_PERCENT, dtype: float64
We observe the correlation coefficients between the pair of these independent variables < 0.7 indicating low to moderate correlation!
Multicollinearity does not affect tree-based algorithms such as Decision Trees and Random Forests, but it does affect parametric algorithms like Linear Regression, Logistic Regression, Naive Bayes, etc.
# Distribution plot plus Shapiro normality check for every numeric feature.
for feature_num, each in enumerate(colsnm, start=1):
    print("\n%d: Feature %s " % (feature_num, each))
    plot_distribution(data, each, "blue")
    normal_distribution_check(data[each])
1: Feature AMT_ANNUITY Feature: AMT_ANNUITY range(1615.5, 258025.5) Mean: 27108.48784108536 Median: 24903.0 Skewness: 1.5798237246955467 Kurtosis: 7.7077559537369025 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.912364661693573, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 2: Feature DAYS_BIRTH Feature: DAYS_BIRTH range(21.0, 69.0) Mean: 43.93864609721278 Median: 43.0 Skewness: 0.1151343538689001 Kurtosis: -1.0472266784228774 The distribution is approximately symmetric. The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9700958132743835, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 3: Feature DAYS_REGISTRATION Feature: DAYS_REGISTRATION range(0.0, 68.0) Mean: 13.659439824916833 Median: 12.0 Skewness: 0.5885810266508744 Kurtosis: -0.3217250717568527 The distribution is positively skewed (skewed to the right). The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9536600708961487, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 4: Feature DAYS_ID_PUBLISH Feature: DAYS_ID_PUBLISH range(0.0, 20.0) Mean: 8.198575010324834 Median: 9.0 Skewness: -0.34966073846663426 Kurtosis: -1.0891039551894959 The distribution is approximately symmetric. The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9385840892791748, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 5: Feature EXT_SOURCE_2 Feature: EXT_SOURCE_2 range(8.173616518884397e-08, 0.8549996664047012) Mean: 0.514503354322039 Median: 0.5659614260608526 Skewness: -0.7959400362910248 Kurtosis: -0.26225703458375627 The distribution is negatively skewed (skewed to the left). The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9266524910926819, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 6: Feature EXT_SOURCE_3 Feature: EXT_SOURCE_3 range(0.0005272652387098, 0.8960095494948396) Mean: 0.5156949091808121 Median: 0.5352762504724826 Skewness: -0.53769669691455 Kurtosis: -0.04869212165958725 The distribution is negatively skewed (skewed to the left). The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9648309350013733, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 7: Feature b_DAYS_CREDIT_max Feature: b_DAYS_CREDIT_max range(-2922.0, 0.0) Mean: -419.25515184822655 Median: -239.0 Skewness: -2.241816733472214 Kurtosis: 5.523059055120931 The distribution is negatively skewed (skewed to the left). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.7388038635253906, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 8: Feature PA_AMT_CREDIT_sum Feature: PA_AMT_CREDIT_sum range(0.0, 41461128.0) Mean: 902685.9269682546 Median: 383778.0 Skewness: 5.077017601460826 Kurtosis: 50.975539142218324 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.5743550658226013, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 9: Feature PA_DAYS_DECISION_max_y Feature: PA_DAYS_DECISION_max_y range(0.0, 8.0) Mean: 1.228359310723844 Median: 1.0 Skewness: 2.0723345505200146 Kurtosis: 4.771499859727258 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.7318885922431946, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 10: Feature IP_DAYS_INSTALMENT_min Feature: IP_DAYS_INSTALMENT_min range(0.0, 8.0) Mean: 4.131913980312899 Median: 4.0 Skewness: 0.06045041625911667 Kurtosis: -1.3808581068332964 The distribution is approximately symmetric. The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9114291667938232, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 11: Feature IP_AMT_INSTALMENT_sum Feature: IP_AMT_INSTALMENT_sum range(0.0, 23274726.93) Mean: 637718.2564077219 Median: 298091.745 Skewness: 2.8553275679162686 Kurtosis: 13.005337442872378 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.6867349147796631, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 12: Feature IP_AMT_INSTALMENT_avg Feature: IP_AMT_INSTALMENT_avg range(0.0, 2504589.66) Mean: 17443.52688302848 Median: 11954.42052631579 Skewness: 18.351061680164857 Kurtosis: 1108.8648749117804 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.48236823081970215, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 13: Feature AT_ANNUITY_INCOME_PERCENT Feature: AT_ANNUITY_INCOME_PERCENT range(0.0002238846153846, 1.8759649122807016) Mean: 0.1809289211110611 Median: 0.1628333333333333 Skewness: 1.5120203936578784 Kurtosis: 5.451605611925493 The distribution is positively skewed (skewed to the right). The distribution has heavier tails (more extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.9120789766311646, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 14: Feature AT_CREDIT_TERM Feature: AT_CREDIT_TERM range(0.0167896919664934, 0.1581142857142857) Mean: 0.053695336136833555 Median: 0.05 Skewness: 1.1170129589310844 Kurtosis: 0.5276272337419181 The distribution is positively skewed (skewed to the right). The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.8819172382354736, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed) 15: Feature AT_DAYS_EMPLOYED_PERCENT Feature: AT_DAYS_EMPLOYED_PERCENT range(-47.48966324275127, 0.7288114904178704) Mean: -2.9201348735243022 Median: 0.088644630713269 Skewness: -1.7598345904832133 Kurtosis: 1.4141048013133508 The distribution is negatively skewed (skewed to the left). The distribution has lighter tails (less extreme values).
Normal Distribution Check using Shapiro test: Test Statistic: 0.5076528787612915, P-value: 0.0 Reject the null hypothesis (data may not be normally distributed)
# For every selected numeric column: compute its IQR-based outlier bounds
# (identify_outliers may also report them) and print descriptive statistics.
for column in colsnm:
    low, high = identify_outliers(data, column)
    print(f"Statistics for {column}")
    print(data[column].describe())
Statistics for AMT_ANNUITY count 307511.000000 mean 27108.487841 std 14493.461065 min 1615.500000 25% 16524.000000 50% 24903.000000 75% 34596.000000 max 258025.500000 Name: AMT_ANNUITY, dtype: float64
Statistics for DAYS_BIRTH count 307511.000000 mean 43.938646 std 11.964047 min 21.000000 25% 34.000000 50% 43.000000 75% 54.000000 max 69.000000 Name: DAYS_BIRTH, dtype: float64
Statistics for DAYS_REGISTRATION count 307511.000000 mean 13.659440 std 9.659369 min 0.000000 25% 6.000000 50% 12.000000 75% 20.000000 max 68.000000 Name: DAYS_REGISTRATION, dtype: float64
Statistics for DAYS_ID_PUBLISH count 307511.000000 mean 8.198575 std 4.151520 min 0.000000 25% 5.000000 50% 9.000000 75% 12.000000 max 20.000000 Name: DAYS_ID_PUBLISH, dtype: float64
Statistics for EXT_SOURCE_2 count 3.075110e+05 mean 5.145034e-01 std 1.908699e-01 min 8.173617e-08 25% 3.929737e-01 50% 5.659614e-01 75% 6.634218e-01 max 8.549997e-01 Name: EXT_SOURCE_2, dtype: float64
Statistics for EXT_SOURCE_3 count 307511.000000 mean 0.515695 std 0.174736 min 0.000527 25% 0.417100 50% 0.535276 75% 0.636376 max 0.896010 Name: EXT_SOURCE_3, dtype: float64
Statistics for b_DAYS_CREDIT_max count 307511.000000 mean -419.255152 std 526.292236 min -2922.000000 25% -537.000000 50% -239.000000 75% -80.000000 max 0.000000 Name: b_DAYS_CREDIT_max, dtype: float64
Statistics for PA_AMT_CREDIT_sum count 3.075110e+05 mean 9.026859e+05 std 1.462142e+06 min 0.000000e+00 25% 1.309185e+05 50% 3.837780e+05 75% 1.090627e+06 max 4.146113e+07 Name: PA_AMT_CREDIT_sum, dtype: float64
Statistics for PA_DAYS_DECISION_max_y count 307511.000000 mean 1.228359 std 1.516777 min 0.000000 25% 0.000000 50% 1.000000 75% 2.000000 max 8.000000 Name: PA_DAYS_DECISION_max_y, dtype: float64
Statistics for IP_DAYS_INSTALMENT_min count 307511.000000 mean 4.131914 std 2.643379 min 0.000000 25% 2.000000 50% 4.000000 75% 7.000000 max 8.000000 Name: IP_DAYS_INSTALMENT_min, dtype: float64
Statistics for IP_AMT_INSTALMENT_sum count 3.075110e+05 mean 6.377183e+05 std 8.698961e+05 min 0.000000e+00 25% 1.147045e+05 50% 2.980917e+05 75% 7.964169e+05 max 2.327473e+07 Name: IP_AMT_INSTALMENT_sum, dtype: float64
Statistics for IP_AMT_INSTALMENT_avg count 3.075110e+05 mean 1.744353e+04 std 2.308920e+04 min 0.000000e+00 25% 7.100846e+03 50% 1.195442e+04 75% 2.059924e+04 max 2.504590e+06 Name: IP_AMT_INSTALMENT_avg, dtype: float64
Statistics for AT_ANNUITY_INCOME_PERCENT count 307511.000000 mean 0.180929 std 0.094573 min 0.000224 25% 0.114782 50% 0.162833 75% 0.229067 max 1.875965 Name: AT_ANNUITY_INCOME_PERCENT, dtype: float64
Statistics for AT_CREDIT_TERM count 307511.000000 mean 0.053695 std 0.022482 min 0.016790 25% 0.036900 50% 0.050000 75% 0.064043 max 0.158114 Name: AT_CREDIT_TERM, dtype: float64
Statistics for AT_DAYS_EMPLOYED_PERCENT count 307511.000000 mean -2.920135 std 6.627098 min -47.489663 25% 0.021559 50% 0.088645 75% 0.191000 max 0.728811 Name: AT_DAYS_EMPLOYED_PERCENT, dtype: float64
We can treat outliers either by winsorization or scaling.
Winsorization replaces extreme values with a specified percentile value. For example, you can replace values above the 95th percentile with the value at the 95th percentile and values below the 5th percentile with the value at the 5th percentile. When can you apply the winsorization to treat outliers? Below are cases when you can use the winsorization technique to treat outliers:
When you want to keep the data distribution intact: Winsorization can be useful when you want to address outliers but still maintain the original data distribution to some extent.
When you have some tolerance for outliers: Winsorization allows you to set a threshold for how extreme values should be replaced, making it a flexible approach.
We will use winsorization technique using IQR method.
# Winsorize (cap) outliers in the chosen columns via the IQR method,
# then draw a boxplot per column to confirm the capping took effect.
df_clean = data.copy()
cap_vars = ['AMT_ANNUITY', 'DAYS_REGISTRATION', 'EXT_SOURCE_3', 'b_DAYS_CREDIT_max',
            'PA_AMT_CREDIT_sum', 'PA_DAYS_DECISION_max_y', 'IP_AMT_INSTALMENT_sum',
            'IP_AMT_INSTALMENT_avg', 'AT_ANNUITY_INCOME_PERCENT', 'AT_CREDIT_TERM',
            'AT_DAYS_EMPLOYED_PERCENT']
for VAR in cap_vars:
    df_clean[VAR] = cap_outliers(df_clean, VAR)
    df_clean.boxplot(column=[VAR])
    plt.show()
We have capped the outliers successfully, as revealed in the above boxplots (winsorization replaces extreme values with the bound values rather than removing rows). Now, we can use the data for EDA.
A two-sample t-test is performed when you want to compare the means of two independent groups to determine if there is a statistically significant difference between them. In the context of one numeric and one categorical variable with two categories, the goal is to assess whether the means of the numeric variable are different between the two categories of the categorical variable.
Determine if there is a statistically significant difference in the means of the numeric variable between the two categories of the categorical variable.
Null Hypothesis (H0):
The means of the numeric variable are equal between the two categories.
Alternative Hypothesis (H1):
The means of the numeric variable are not equal between the two categories.
1.To test whether there is evidence to reject the null hypothesis and conclude that there is a significant difference in the means of the numeric variable between the two categories.
2.The t-test assesses whether the observed difference in means is statistically significant, taking into account the variability within each group.
For a large sample size, the population is approximately normally distributed, regardless of the distribution of the population one samples from. If the population has mean, $\mu$ and standard deviation, $\sigma$, then $\bar{x}$ has mean $\mu$ and standard deviation $\sigma$ /$\sqrt{n}$.
If the population is skewed, then the distribution of sample mean looks more and more normal when it gets larger.
A two-sample t-test helps you assess whether the means of a numeric variable differ significantly between two categories of a categorical variable, providing insights into potential group differences.
One of the main assumptions of the t-test is that the distribution is approximately normal.
Our dataset has more than 300K rows; by the Central Limit Theorem, the sampling distribution of the mean is approximately normal at this sample size, so the t-test is applicable.
Reference: https://www.jmp.com/en_in/statistics-knowledge-portal/t-test.html
# Independent two-sample t-test for every feature against TARGET:
# compares the feature mean of non-defaulters (TARGET == 0) with that of
# defaulters (TARGET == 1).
print("Two Sample t-test for all the feature :\n")

# Significance threshold, hoisted out of the loop (it is loop-invariant).
# NOTE(review): comparing a TWO-SIDED p-value against 0.025 tests at an
# overall 2.5% level, not 5% — confirm this threshold is intentional.
alpha = 0.025

for feature in df_clean.drop("TARGET", axis=1).columns:
    print(feature, ":\n")
    group_0 = df_clean[df_clean['TARGET'] == 0][feature]  # non-defaulters
    group_1 = df_clean[df_clean['TARGET'] == 1][feature]  # defaulters
    t_statistic, p_value = stats.ttest_ind(group_0, group_1, alternative='two-sided')
    print(f'T-statistic: {t_statistic}')
    print(f'P-value: {p_value}')
    if p_value < alpha:
        print("Reject the null hypothesis: There is a significant difference in means.\n\n")
    else:
        print("Fail to reject the null hypothesis: There is no significant difference in means.\n\n")
Two Sample t-test for all the feature : AMT_ANNUITY : T-statistic: 5.320846526733901 P-value: 1.0335752405588428e-07 Reject the null hypothesis: There is a significant difference in means. DAYS_BIRTH : T-statistic: 43.53317094646741 P-value: 0.0 Reject the null hypothesis: There is a significant difference in means. DAYS_REGISTRATION : T-statistic: 23.20234929909641 P-value: 5.459972083536739e-119 Reject the null hypothesis: There is a significant difference in means. DAYS_ID_PUBLISH : T-statistic: 28.51487601156059 P-value: 1.3119142604492716e-178 Reject the null hypothesis: There is a significant difference in means. EXT_SOURCE_2 : T-statistic: 90.05350588929981 P-value: 0.0 Reject the null hypothesis: There is a significant difference in means. EXT_SOURCE_3 : T-statistic: 87.10705100677774 P-value: 0.0 Reject the null hypothesis: There is a significant difference in means. b_DAYS_CREDIT_max : T-statistic: -35.21855336687336 P-value: 3.62749664885773e-271 Reject the null hypothesis: There is a significant difference in means. PA_AMT_CREDIT_sum : T-statistic: -3.115855642211741 P-value: 0.0018342890681729625 Reject the null hypothesis: There is a significant difference in means. PA_DAYS_DECISION_max_y : T-statistic: 7.266283501052061 P-value: 3.7038251334822315e-13 Reject the null hypothesis: There is a significant difference in means. IP_DAYS_INSTALMENT_min : T-statistic: 25.886197465694 P-value: 1.373032097872596e-147 Reject the null hypothesis: There is a significant difference in means. IP_AMT_INSTALMENT_sum : T-statistic: 11.355364647612658 P-value: 7.071197863301575e-30 Reject the null hypothesis: There is a significant difference in means. IP_AMT_INSTALMENT_avg : T-statistic: 11.084197990260735 P-value: 1.5156363902925323e-28 Reject the null hypothesis: There is a significant difference in means. AT_ANNUITY_INCOME_PERCENT : T-statistic: -8.586180676281883 P-value: 9.031707841975961e-18 Reject the null hypothesis: There is a significant difference in means. 
AT_CREDIT_TERM : T-statistic: -7.026796629357176 P-value: 2.117651195769695e-12 Reject the null hypothesis: There is a significant difference in means. AT_DAYS_EMPLOYED_PERCENT : T-statistic: 0.8005022581812614 P-value: 0.42342047416180584 Fail to reject the null hypothesis: There is no significant difference in means.
1.The average AMT_ANNUITY is significantly different between the two groups.
2.The average age (DAYS_BIRTH) is significantly different between the two groups.
3.The average number of days since registration is significantly different between the two groups.
4.The average number of days since ID publication is significantly different between the two groups.
5.The average EXT_SOURCE_2 is significantly different between the two groups.
6.The average EXT_SOURCE_3 is significantly different between the two groups.
7.The sum of PA_AMT_CREDIT is significantly different between the two groups.
8.The maximum of PA_DAYS_DECISION in the last year is significantly different between the two groups.
9.The minimum of IP_DAYS_INSTALMENT is significantly different between the two groups.
10.The sum of IP_AMT_INSTALMENT is significantly different between the two groups.
11.The average of IP_AMT_INSTALMENT is significantly different between the two groups.
12.The ratio of credit amount to income is significantly different between the two groups.
13.The ratio of annuity amount to income is significantly different between the two groups.
14.The credit term is significantly different between the two groups.
15.There is no significant difference in the ratio of days employed to income between the two groups.
In summary, for most of the features, there is a significant difference in means between the two groups, indicating potential distinctions between the groups for these variables. However, for the feature AT_DAYS_EMPLOYED_PERCENT, there is no significant difference in means. These results provide insights into the characteristics that vary significantly between the two groups in your dataset.
Usually, the Mann-Whitney U test is used when the data is ordinal or when the assumptions of the t-test are not met.
We will not use Mann-Whitney U test as the assumptions of t test are met.
# AT_DAYS_EMPLOYED_PERCENT showed no significant mean difference between
# the TARGET groups, so drop it; then re-inspect the class balance.
df_clean_ = df_clean.drop(columns=['AT_DAYS_EMPLOYED_PERCENT'])
distribution_of_target(df_clean_)
TARGET 0 282686 1 24825 Name: count, dtype: int64 Percentage: TARGET 0 91.927118 1 8.072882 Name: count, dtype: float64
We have 307511 rows and 16 columns and all are numeric columns.
Our target variable, is having two classes 0 (Non-defaulter) and 1 (Defaulter) with count and % as follows:
One approach to addressing imbalanced datasets is to oversample the minority class. The simplest approach involves duplicating examples in the minority class, although these examples don’t add any new information to the model. Instead, new examples can be synthesized from the existing examples. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE for short.
Refer: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
We have converted absolute value of days into years by dividing by 365 for the following variables: 1) DAYS_BIRTH 2) DAYS_REGISTRATION 3) DAYS_ID_PUBLISH 4) PA_DAYS_DECISION_max_y 5) IP_DAYS_INSTALMENT_min
# Separate features from the target, then standardize each feature to
# zero mean / unit variance so downstream models see comparable scales.
X = df_clean_.drop("TARGET", axis=1)
y = df_clean_["TARGET"]
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
print(X.head())
AMT_ANNUITY DAYS_BIRTH DAYS_REGISTRATION DAYS_ID_PUBLISH EXT_SOURCE_2 \ 0 -0.157903 -1.499382 -0.379141 -0.529584 -1.317940 1 0.670180 0.172296 -1.106990 -1.733964 0.564482 2 -1.509469 0.673799 -0.171184 -0.288708 0.216948 3 0.217513 0.673799 1.388494 -0.288708 0.712205 4 -0.371362 0.924551 -0.171184 0.193044 -1.004691 EXT_SOURCE_3 b_DAYS_CREDIT_max PA_AMT_CREDIT_sum PA_DAYS_DECISION_max_y \ 0 -2.173696 0.709302 -0.694920 0.633999 1 0.109944 -0.634460 0.905583 0.633999 2 1.230654 -0.105504 -0.894680 0.633999 3 0.109944 0.984466 2.259885 -0.889746 4 0.109944 -2.085082 0.336598 -0.127874 IP_DAYS_INSTALMENT_min IP_AMT_INSTALMENT_sum IP_AMT_INSTALMENT_avg \ 0 -0.806512 -0.566252 -0.328256 1 0.706705 1.881387 2.263770 2 -0.806512 -0.913196 -0.723251 3 -1.184816 0.811342 2.263770 4 0.706705 0.511924 -0.230266 AT_ANNUITY_INCOME_PERCENT AT_CREDIT_TERM 0 -0.658522 0.336442 1 -0.539295 -1.185411 2 -0.914442 -0.157022 3 0.481736 1.906089 4 0.016688 -0.495685
Our dataset has 307511 rows. We had 134 independent variables.
Recursive Feature Elimination, or RFE for short, is a popular feature selection algorithm that identifies the features in a dataset that are most relevant for predicting the target variable.
RFE applies a backward selection process to find the best combination of features. This is done as follows:
Builds a model based on all features and calculates the importance of each feature in the model. It ranks the features and removes the feature(s) with the least importance iteratively based on model evaluation metrics such as accuracy ratio, recall.
Using the RFECV class, we achieved the feature selection by performing cross-validation evaluation of different numbers of features and automatically selecting the number of features that resulted in the best mean score.
The RFECV is configured by specifying the minimum number of features via the “min_features_to_select” argument (defaults to 1) and we can also specify the type of cross-validation and scoring to use via the “cv” (defaults to 5) and “scoring” arguments (uses accuracy for classification).
We selected the 15 independent variables by performing RFE: ['AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'b_DAYS_CREDIT_max', 'PA_AMT_CREDIT_sum', 'PA_DAYS_DECISION_max_y', 'IP_DAYS_INSTALMENT_min', 'IP_AMT_INSTALMENT_sum', 'IP_AMT_INSTALMENT_avg', 'AT_ANNUITY_INCOME_PERCENT', 'AT_CREDIT_TERM', 'AT_DAYS_EMPLOYED_PERCENT']
Now, we have 307511 rows and 16 columns in our dataset.
Dimensionality reduction is used to reduce the number of input variables in the training data, by reducing the dimension of your feature set. When a model has a high number of features, it is naturally more complex leading to a higher chance of overfitting and decrease in accuracy.
https://www.kdnuggets.com/2022/07/machine-learning-algorithms-explained-less-1-minute.html
While both feature selection and dimensionality reduction aim to reduce the number of features, they differ in their approach. Common dimensionality reduction techniques include Principal Component Analysis (PCA), Factor Analysis and Linear Discriminant Analysis (LDA). We have applied feature selection to reduce the number of features.
https://www.kdnuggets.com/2022/09/dimensionality-reduction-techniques-data-science.html
We are not going to apply dimensionality reduction techniques such as PCA, Factor Analysis and LDA.
# Build a dated output filename, e.g. "Final Houseing Loan data20240128.csv".
# strftime() already returns a str, so the redundant str() wrapper is dropped.
# NOTE(review): "Houseing" is a typo for "Housing" and there is no separator
# before the date — kept byte-identical here so anything that consumes this
# filename keeps working; fix at the source if nothing depends on it.
filename_ = "Final Houseing Loan data" + datetime.datetime.now().strftime("%Y%m%d") + ".csv"
print(filename_)
Final Houseing Loan data20240128.csv
# Assemble the final modelling dataset (scaled features + TARGET) and
# persist it to CSV.
# Bug fix: the original did `df_final = X`, which only ALIASES X — the
# subsequent `df_final['TARGET'] = y` therefore mutated X as well, leaking
# the target back into the feature frame. Copy first so X stays features-only.
df_final = X.copy()
df_final['TARGET'] = y
df_final.to_csv(filename_, index=False)